In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
from warnings import filterwarnings
from collections import Counter

# Visualizations Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objs as go
pyo.init_notebook_mode()
import plotly.figure_factory as ff
import missingno as msno

# Data Pre-processing Libraries
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

# Modelling Libraries
from sklearn.linear_model import LogisticRegression,RidgeClassifier,SGDClassifier,PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC,LinearSVC,NuSVC
from sklearn.neighbors import KNeighborsClassifier,NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.ensemble import VotingClassifier

# Evaluation & CV Libraries
from sklearn.metrics import precision_score,accuracy_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,RepeatedStratifiedKFold
In [2]:
pd.read_csv('water_potability.csv')
Out[2]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 NaN 392.449580 19.903225 NaN 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 NaN 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 NaN 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 NaN 327.459760 16.140368 78.698446 2.309149 1

3276 rows × 10 columns

In [3]:
df=pd.read_csv('water_potability.csv')
In [4]:
df
Out[4]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 NaN 392.449580 19.903225 NaN 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 NaN 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 NaN 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 NaN 327.459760 16.140368 78.698446 2.309149 1

3276 rows × 10 columns

In [5]:
df.head()
Out[5]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
In [6]:
df.describe()
Out[6]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 2785.000000 3276.000000 3276.000000 3276.000000 2495.000000 3276.000000 3276.000000 3114.000000 3276.000000 3276.000000
mean 7.080795 196.369496 22014.092526 7.122277 333.775777 426.205111 14.284970 66.396293 3.966786 0.390110
std 1.594320 32.879761 8768.570828 1.583085 41.416840 80.824064 3.308162 16.175008 0.780382 0.487849
min 0.000000 47.432000 320.942611 0.352000 129.000000 181.483754 2.200000 0.738000 1.450000 0.000000
25% 6.093092 176.850538 15666.690297 6.127421 307.699498 365.734414 12.065801 55.844536 3.439711 0.000000
50% 7.036752 196.967627 20927.833607 7.130299 333.073546 421.884968 14.218338 66.622485 3.955028 0.000000
75% 8.062066 216.667456 27332.762127 8.114887 359.950170 481.792304 16.557652 77.337473 4.500320 1.000000
max 14.000000 323.124000 61227.196008 13.127000 481.030642 753.342620 28.300000 124.000000 6.739000 1.000000
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
In [8]:
colors_blue = ["#132C33", "#264D58", '#17869E', '#51C4D3', '#B4DBE9']
colors_dark = ["#1F1F1F", "#313131", '#636363', '#AEAEAE', '#DADADA']
colors_green = ['#01411C','#4B6F44','#4F7942','#74C365','#D0F0C0']
sns.palplot(colors_blue)
sns.palplot(colors_green)
sns.palplot(colors_dark)


d= pd.DataFrame(df['Potability'].value_counts())
fig = px.pie(d,values='Potability',names=['Not Potable','Potable'],hole=0.4,opacity=0.6,
            color_discrete_sequence=[colors_green[3],colors_blue[3]],
             labels={'label':'Potability','Potability':'No. Of Samples'})

fig.add_annotation(text='We can resample the data<br> to get a balanced dataset',
                   x=1.2,y=0.9,showarrow=False,font_size=12,opacity=0.7,font_family='monospace')
fig.add_annotation(text='Potability',
                   x=0.5,y=0.5,showarrow=False,font_size=14,opacity=0.7,font_family='monospace')

fig.update_layout(
    font_family='monospace',
    title=dict(text='Q. How many samples of water are Potable?',x=0.47,y=0.98,
               font=dict(color=colors_dark[2],size=20)),
    legend=dict(x=0.37,y=-0.05,orientation='h',traceorder='reversed'),
    hoverlabel=dict(bgcolor='white'))

fig.update_traces(textposition='outside', textinfo='percent+label')

fig.show()
In [9]:
#Hardenss of water: 
#The simple definition of water hardness is the amount of dissolved calcium and magnesium in the water. 
#Hard water is high in dissolved minerals, largely calcium and magnesium. You may have felt the effects of hard water, 
#literally, the last time you washed your hands. Depending on the hardness of your water, after using soap to wash you may 
#have felt like there was a film of residue left on your hands. In hard water, soap reacts with the calcium 
#(which is relatively high in hard water) to form "soap scum". When using hard water, more soap or detergent is needed 
#to get things clean, be it your hands, hair, or your laundry.#Hardenss of water: The simple definition of water hardness 
#is the amount of dissolved calcium and magnesium in the water. Hard water is high in dissolved minerals, largely calcium and 
#magnesium. You may have felt the effects of hard water, literally, the last time you washed your hands. 
#Depending on the hardness of your water, after using soap to wash you may have felt like there was a film of residue left 
#on your hands. In hard water, soap reacts with the calcium (which is relatively high in hard water) to form "soap scum". 
#When using hard water, more soap or detergent is needed to get things clean, be it your hands, hair, or your laundry.
In [10]:
fig = px.histogram(df, x='Hardness', y=Counter(df['Hardness']), color='Potability', template='plotly_white', 
                   marginal='box',opacity=.7, nbins=100, color_discrete_sequence=[colors_green[3],colors_blue[3]], 
                   barmode='group', histfunc='count')
fig.add_vline(x=151, line_width=1, line_color=colors_dark[1],line_dash='dot', opacity=.7) 
fig.add_vline(x=301, line_width=1, line_color=colors_dark[1],line_dash='dot', opacity=.7) 
fig.add_vline(x=76, line_width=1, line_color=colors_dark[1],line_dash='dot', opacity=.7) 

fig.add_annotation(text='<76 mg/l is <br> considered soft', x=40, y=130, showarrow=False, font_size=9)
fig.add_annotation(text='Between 76 and 150 <br> (mg/L is considered <br> moderately hard', x=113, y=130, showarrow=False, font_size=9)
fig.add_annotation(text='Between 151 and 300(mg/L)<br> is considered Hard', x=250, y=130, showarrow=False, font_size=9)
fig.add_annotation(text='>300 mg/L is <br> considered very Hard', x=340, y=130, showarrow=False, font_size=9)

fig.update_layout(
    font_family='monospace',
    title=dict(text='Hard Distrisbution', x=.53, y=.95,
            font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='Hardness (mg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=.96, bordercolor=colors_dark[4],borderwidth=0, tracegroupgap=5),
    bargap=.3
)
In [11]:
#pH level: 
#The pH of water is a measure of the acid–base equilibrium and, in most natural waters, 
#is controlled by the carbon dioxide–bicarbonate–carbonate equilibrium system. An increased carbon dioxide concentration 
#will therefore lower pH, whereas a decrease will cause it to rise. Temperature will also affect the equilibria and the pH. 
#In pure water, a decrease in pH of about 0.45 occurs as the temperature is raised by 25 °C. In water with a buffering capacity 
#imparted by bicarbonate, carbonate and hydroxyl ions, this temperature effect is modified (APHA, 1989). 
#The pH of most drinking-water lies within the range 6.5–8.5. 
#Natural waters can be of lower pH, as a result of, for example, acid rain or higher pH in limestone areas.
In [12]:
fig = px.histogram(df,x='ph',y=Counter(df['ph']),color='Potability',template='plotly_white',
                  marginal='box',opacity=0.7,nbins=100,color_discrete_sequence=[colors_green[3],colors_blue[3]],
                  barmode='group',histfunc='count')

fig.add_vline(x=7, line_width=1, line_color=colors_dark[1],line_dash='dot',opacity=0.7)

fig.add_annotation(text='<7 is Acidic',x=4,y=70,showarrow=False,font_size=10)
fig.add_annotation(text='>7 is Basic',x=10,y=70,showarrow=False,font_size=10)


fig.update_layout(
    font_family='monospace',
    title=dict(text='pH Level Distribution',x=0.5,y=0.95,
               font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='pH Level',
    yaxis_title_text='Count',
    legend=dict(x=1,y=0.96,bordercolor=colors_dark[4],borderwidth=0,tracegroupgap=5),
    bargap=0.3,
)
fig.show()
In [13]:
#TDS: TDS means concentration of dissolved particles or solids in water. 
#TDS comprises of inorganic salts such as calcium, magnesium, chlorides, sulfates, bicarbonates, etc, 
#along with many more inorganic compounds that easily dissolve in water.
In [14]:
fig2 =px.histogram(df, x='Solids', y=Counter(df['Solids']),color = 'Potability', template='plotly_white',
                   marginal='box', opacity=.7, nbins=100, color_discrete_sequence=[colors_green[3],colors_blue[3]],
                   barmode='group', histfunc='count')
fig2.update_layout(
    font_family='monospace', 
    title=dict(text="Distribution of Total Dissolved Solids", x=.5, y=.95, font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='Dissolved Solids(ppm)',
    yaxis_title_text='Count',
    legend=dict(x=1, y=.96, bordercolor=colors_dark[4] ,borderwidth=0, tracegroupgap=5), 
    bargap=.3)
In [15]:
#Sulfate: Sulfate (SO4) can be found in almost all natural water. 
#The origin of most sulfate compounds is the oxidation of sulfite ores, the presence of shales, 
#or the industrial wastes. Sulfate is one of the major dissolved components of rain. 
#High concentrations of sulfate in the water we drink can have a laxative effect when combined with calcium and magnesium, 
#the two most common constituents of hardness.
In [16]:
fig = px.histogram(df,x='Sulfate',y=Counter(df['Sulfate']),color='Potability',template='plotly_white',
                  marginal='box',opacity=0.7,nbins=100,color_discrete_sequence=[colors_green[3],colors_blue[3]],
                  barmode='group',histfunc='count')

fig.add_vline(x=250, line_width=1, line_color=colors_dark[1],line_dash='dot',opacity=0.7)

fig.add_annotation(text='<250 mg/L is considered<br> safe for drinking',x=175,y=90,showarrow=False)

fig.update_layout(
    font_family='monospace',
    title=dict(text='Sulfate Distribution',x=0.53,y=0.95,
               font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='Sulfate (mg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1,y=0.96,bordercolor=colors_dark[4],borderwidth=0,tracegroupgap=5),
    bargap=0.3,
)
fig.show()
In [17]:
#Chloramines: Chloramines (also known as secondary disinfection) are disinfectants used to treat drinking water and they:

#Are most commonly formed when ammonia is added to chlorine to treat drinking water.
#Provide longer-lasting disinfection as the water moves through pipes to consumers.
#Chloramines have been used by water utilities since the 1930s.
In [18]:
fig3 = px.histogram(df,x='Chloramines',y=Counter(df['Chloramines']),color='Potability',template='plotly_white',
                  marginal='box',opacity=0.7,nbins=100,color_discrete_sequence=[colors_green[3],colors_blue[3]],
                  barmode='group',histfunc='count')

fig3.add_vline(x=4, line_width=1, line_color=colors_dark[1],line_dash='dot',opacity=0.7)

fig3.add_annotation(text='<4 ppm is considered<br> safe for drinking',x=1.8,y=90,showarrow=False)

fig3.update_layout(
    font_family='monospace',
    title=dict(text='Chloramines Distribution',x=0.53,y=0.95,
               font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='Chloramines (ppm)',
    yaxis_title_text='Count',
    legend=dict(x=1,y=0.96,bordercolor=colors_dark[4],borderwidth=0,tracegroupgap=5),
    bargap=0.3,
)
fig3.show()
In [19]:
fig4 = px.histogram(df,x='Sulfate',y=Counter(df['Sulfate']),color='Potability',template='plotly_white',
                  marginal='box',opacity=0.7,nbins=100,color_discrete_sequence=[colors_green[3],colors_blue[3]],
                  barmode='group',histfunc='count')

fig4.add_vline(x=250, line_width=1, line_color=colors_dark[1],line_dash='dot',opacity=0.7)

fig4.add_annotation(text='<250 mg/L is considered<br> safe for drinking',x=175,y=90,showarrow=False)

fig4.update_layout(
    font_family='monospace',
    title=dict(text='Sulfate Distribution',x=0.53,y=0.95,
               font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='Sulfate (mg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1,y=0.96,bordercolor=colors_dark[4],borderwidth=0,tracegroupgap=5),
    bargap=0.3,
)
fig4.show()
In [20]:
#Conductivity: Conductivity is a measure of the ability of water to pass an electrical current. 
#Because dissolved salts and other inorganic chemicals conduct electrical current, conductivity increases as salinity increases. 
#Organic compounds like oil do not conduct electrical current very well and therefore have a low conductivity when in water. 
#Conductivity is also affected by temperature: the warmer the water, the higher the conductivity.
In [21]:
fig = px.histogram(df,x='Conductivity',y=Counter(df['Conductivity']),color='Potability',template='plotly_white',
                  marginal='box',opacity=0.7,nbins=100,color_discrete_sequence=[colors_green[3],colors_blue[3]],
                  barmode='group',histfunc='count')

fig.add_annotation(text='The Conductivity range <br> is safe for both (200-800),<br> Potable and Non-Potable water',
                   x=600,y=90,showarrow=False)

fig.update_layout(
    font_family='monospace',
    title=dict(text='Conductivity Distribution',x=0.5,y=0.95,
               font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='Conductivity (μS/cm)',
    yaxis_title_text='Count',
    legend=dict(x=1,y=0.96,bordercolor=colors_dark[4],borderwidth=0,tracegroupgap=5),
    bargap=0.3,
)
fig.show()
In [22]:
#Organic Carbon: Organic contaminants (natural organic substances, insecticides, herbicides, and other agricultural chemicals) 
#enter waterways in rainfall runoff. Domestic and industrial wastewaters also contribute organic contaminants in various amounts. 
#As a result of accidental spills or leaks, industrial organic wastes may enter streams. Some of the contaminants may not be 
#completely removed by treatment processes; therefore, they could become a problem for drinking water sources. 
#It is important to know the organic content in a waterway.
In [23]:
fig = px.histogram(df,x='Organic_carbon',y=Counter(df['Organic_carbon']),color='Potability',template='plotly_white',
                  marginal='box',opacity=0.7,nbins=100,color_discrete_sequence=[colors_green[3],colors_blue[3]],
                  barmode='group',histfunc='count')

fig.add_vline(x=10, line_width=1, line_color=colors_dark[1],line_dash='dot',opacity=0.7)

fig.add_annotation(text='Typical Organic Carbon<br> level is upto 10 ppm',x=5.3,y=110,showarrow=False)

fig.update_layout(
    font_family='monospace',
    title=dict(text='Organic Carbon Distribution',x=0.5,y=0.95,
               font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='Organic Carbon (ppm)',
    yaxis_title_text='Count',
    legend=dict(x=1,y=0.96,bordercolor=colors_dark[4],borderwidth=0,tracegroupgap=5),
    bargap=0.3,
)
fig.show()
In [24]:
#Trihalomethanes: Trihalomethanes (THMs) are the result of a reaction between the chlorine used for disinfecting tap water and 
#natural organic matter in the water. 
#At elevated levels, THMs have been associated with negative health effects such as cancer and adverse reproductive outcomes.
In [25]:
fig = px.histogram(df,x='Trihalomethanes',y=Counter(df['Trihalomethanes']),color='Potability',template='plotly_white',
                  marginal='box',opacity=0.7,nbins=100,color_discrete_sequence=[colors_green[3],colors_blue[3]],
                  barmode='group',histfunc='count')

fig.add_vline(x=80, line_width=1, line_color=colors_dark[1],line_dash='dot',opacity=0.7)

fig.add_annotation(text='Upper limit of Trihalomethanes<br> level is 80 μg/L',x=115,y=90,showarrow=False)

fig.update_layout(
    font_family='monospace',
    title=dict(text='Trihalomethanes Distribution',x=0.5,y=0.95,
               font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='Trihalomethanes (μg/L)',
    yaxis_title_text='Count',
    legend=dict(x=1,y=0.96,bordercolor=colors_dark[4],borderwidth=0,tracegroupgap=5),
    bargap=0.3,
)
fig.show()
In [26]:
#Turbidity: Turbidity is the measure of relative clarity of a liquid. 
#It is an optical characteristic of water and is a measurement of the amount of light that is scattered by material in the water
#when a light is shined through the water sample. The higher the intensity of scattered light, the higher the turbidity. 
#Material that causes water to be turbid include clay, silt, very tiny inorganic and organic matter, algae, 
#dissolved colored organic compounds, and plankton and other microscopic organisms.
In [27]:
fig = px.histogram(df,x='Turbidity',y=Counter(df['Turbidity']),color='Potability',template='plotly_white',
                  marginal='box',opacity=0.7,nbins=100,color_discrete_sequence=[colors_green[3],colors_blue[3]],
                  barmode='group',histfunc='count')

fig.add_vline(x=5, line_width=1, line_color=colors_dark[1],line_dash='dot',opacity=0.7)

fig.add_annotation(text='<5 NTU Turbidity is<br> considered safe',x=6,y=90,showarrow=False)

fig.update_layout(
    font_family='monospace',
    title=dict(text='Turbidity Distribution',x=0.5,y=0.95,
               font=dict(color=colors_dark[2],size=20)),
    xaxis_title_text='Turbidity (NTU)',
    yaxis_title_text='Count',
    legend=dict(x=1,y=0.96,bordercolor=colors_dark[4],borderwidth=0,tracegroupgap=5),
    bargap=0.3,
)
fig.show()
In [28]:
fig = px.scatter_matrix(df,df.drop('Potability',axis=1),height=1250,width=1250,template='plotly_white',opacity=0.7,
                        color_discrete_sequence=[colors_blue[3],colors_green[3]],color='Potability',
                       symbol='Potability',color_continuous_scale=[colors_green[3],colors_blue[3]])

fig.update_layout(font_family='monospace',font_size=10,
                  coloraxis_showscale=False,
                 legend=dict(x=0.02,y=1.07,bgcolor=colors_dark[4]),
                 title=dict(text='Scatter Plot Matrix b/w Features',x=0.5,y=0.97,
                   font=dict(color=colors_dark[2],size=24)))
fig.show()
In [29]:
cor=df.drop('Potability',axis=1).corr()
cor
Out[29]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity
ph 1.000000 0.082096 -0.089288 -0.034350 0.018203 0.018614 0.043503 0.003354 -0.039057
Hardness 0.082096 1.000000 -0.046899 -0.030054 -0.106923 -0.023915 0.003610 -0.013013 -0.014449
Solids -0.089288 -0.046899 1.000000 -0.070148 -0.171804 0.013831 0.010242 -0.009143 0.019546
Chloramines -0.034350 -0.030054 -0.070148 1.000000 0.027244 -0.020486 -0.012653 0.017084 0.002363
Sulfate 0.018203 -0.106923 -0.171804 0.027244 1.000000 -0.016121 0.030831 -0.030274 -0.011187
Conductivity 0.018614 -0.023915 0.013831 -0.020486 -0.016121 1.000000 0.020966 0.001285 0.005798
Organic_carbon 0.043503 0.003610 0.010242 -0.012653 0.030831 0.020966 1.000000 -0.013274 -0.027308
Trihalomethanes 0.003354 -0.013013 -0.009143 0.017084 -0.030274 0.001285 -0.013274 1.000000 -0.022145
Turbidity -0.039057 -0.014449 0.019546 0.002363 -0.011187 0.005798 -0.027308 -0.022145 1.000000
In [30]:
fig = px.imshow(cor,height=800,width=800,color_continuous_scale=colors_blue,template='plotly_white')

fig.update_layout(font_family='monospace',
                title=dict(text='Correlation Heatmap',x=0.5,y=0.93,
                             font=dict(color=colors_dark[2],size=24)),
                coloraxis_colorbar=dict(len=0.85,x=1.1) 
                 )

fig.show()
In [31]:
#dealing with missing values
In [32]:
fig = msno.matrix(df,color=(0,0.5,0.5))
In [33]:
df.isnull().sum()
Out[33]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [34]:
df[df['Potability']==0].describe()
Out[34]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 1684.000000 1998.000000 1998.000000 1998.000000 1510.000000 1998.000000 1998.000000 1891.000000 1998.000000 1998.0
mean 7.085378 196.733292 21777.490788 7.092175 334.564290 426.730454 14.364335 66.303555 3.965800 0.0
std 1.683499 31.057540 8543.068788 1.501045 36.745549 80.047317 3.334554 16.079320 0.780282 0.0
min 0.000000 98.452931 320.942611 1.683993 203.444521 181.483754 4.371899 0.738000 1.450000 0.0
25% 6.037723 177.823265 15663.057382 6.155640 311.264006 368.498530 12.101057 55.706530 3.444062 0.0
50% 7.035456 197.123423 20809.618280 7.090334 333.389426 422.229331 14.293508 66.542198 3.948076 0.0
75% 8.155510 216.120687 27006.249009 8.066462 356.853897 480.677198 16.649485 77.277704 4.496106 0.0
max 14.000000 304.235912 61227.196008 12.653362 460.107069 753.342620 28.300000 120.030077 6.739000 0.0
In [35]:
df[df['Potability']==1].describe()
Out[35]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 1101.000000 1278.000000 1278.000000 1278.000000 985.000000 1278.000000 1278.000000 1223.000000 1278.000000 1278.0
mean 7.073783 195.800744 22383.991018 7.169338 332.566990 425.383800 14.160893 66.539684 3.968328 1.0
std 1.448048 35.547041 9101.010208 1.702988 47.692818 82.048446 3.263907 16.327419 0.780842 0.0
min 0.227499 47.432000 728.750830 0.352000 129.000000 201.619737 2.200000 8.175876 1.492207 1.0
25% 6.179312 174.330531 15668.985035 6.094134 300.763772 360.939023 12.033897 56.014249 3.430909 1.0
50% 7.036752 196.632907 21199.386614 7.215163 331.838167 420.712729 14.162809 66.678214 3.958576 1.0
75% 7.933068 218.003420 27973.236446 8.199261 365.941346 484.155911 16.356245 77.380975 4.509569 1.0
max 13.175402 323.124000 56488.672413 13.127000 481.030642 695.369528 23.604298 124.000000 6.494249 1.0
In [36]:
df[df['Potability']==0][['ph','Sulfate','Trihalomethanes']].median()
Out[36]:
ph                   7.035456
Sulfate            333.389426
Trihalomethanes     66.542198
dtype: float64
In [38]:
df[df['Potability']==0][['ph','Sulfate','Trihalomethanes']].mean()
Out[38]:
ph                   7.085378
Sulfate            334.564290
Trihalomethanes     66.303555
dtype: float64
In [39]:
# the mean and median is nearly identical 
In [42]:
df['ph'].fillna(value=df['ph'].median(),inplace=True)
df['Sulfate'].fillna(value=df['Sulfate'].median(),inplace=True)
df['Trihalomethanes'].fillna(value=df['Trihalomethanes'].median(),inplace=True)
#filling all null values with the median value. 
In [41]:
df.isnull().sum()
Out[41]:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64
In [43]:
X = df.drop('Potability',axis=1).values
In [44]:
X
Out[44]:
array([[7.03675210e+00, 2.04890455e+02, 2.07913190e+04, ...,
        1.03797831e+01, 8.69909705e+01, 2.96313538e+00],
       [3.71608008e+00, 1.29422921e+02, 1.86300579e+04, ...,
        1.51800131e+01, 5.63290763e+01, 4.50065627e+00],
       [8.09912419e+00, 2.24236259e+02, 1.99095417e+04, ...,
        1.68686369e+01, 6.64200925e+01, 3.05593375e+00],
       ...,
       [9.41951032e+00, 1.75762646e+02, 3.31555782e+04, ...,
        1.10390697e+01, 6.98454003e+01, 3.29887550e+00],
       [5.12676292e+00, 2.30603758e+02, 1.19838694e+04, ...,
        1.11689462e+01, 7.74882131e+01, 4.70865847e+00],
       [7.87467136e+00, 1.95102299e+02, 1.74041771e+04, ...,
        1.61403676e+01, 7.86984463e+01, 2.30914906e+00]])
In [45]:
y = df['Potability'].values
In [46]:
y
Out[46]:
array([0, 0, 0, ..., 1, 1, 1], dtype=int64)
In [47]:
df
Out[47]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 7.036752 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 333.073546 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 333.073546 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 333.073546 392.449580 19.903225 66.622485 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 333.073546 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 333.073546 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 333.073546 327.459760 16.140368 78.698446 2.309149 1

3276 rows × 10 columns

In [48]:
X_train, X_test, y_train, y_test =train_test_split(X,y, test_size=.3, random_state=101)
In [50]:
scaler=StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)
In [61]:
filterwarnings('ignore')
models=[("LR", LogisticRegression(max_iter=1000)), ("SVC",SVC()),("KNN", KNeighborsClassifier(n_neighbors=10)),
        ("DTC",DecisionTreeClassifier()), ("GNB",GaussianNB()),("SGDC", SGDClassifier()), ("Perc", Perceptron()),
       ("NC", NearestCentroid()), ("Ridge",RidgeClassifier()), ("BNB",BernoulliNB()), ("RF", RandomForestClassifier()),
       ("ADA", AdaBoostClassifier()), ("XGB", GradientBoostingClassifier()), ('PAC', PassiveAggressiveClassifier())]

results= []
names = []
finalresults=[]

for name, model in models:
    model.fit(X_train, y_train)
    model_results=model.predict(X_test)
    score=precision_score(y_test, model_results, average='macro')
    results.append(score)
    names.append(name)
    finalresults.append((name,score))

finalresults.sort(key=lambda k:k[1], reverse=True)
In [62]:
finalresults
Out[62]:
[('SVC', 0.6966295854853661),
 ('RF', 0.6523135493947676),
 ('XGB', 0.6437883435582822),
 ('GNB', 0.611372136512389),
 ('KNN', 0.610276073619632),
 ('DTC', 0.5827673163190308),
 ('ADA', 0.5600032379487594),
 ('SGDC', 0.5412043458159963),
 ('Perc', 0.5166988189308038),
 ('NC', 0.4991565832435637),
 ('PAC', 0.45516738959320724),
 ('LR', 0.30671414038657174),
 ('Ridge', 0.30671414038657174),
 ('BNB', 0.30671414038657174)]
In [64]:
model_params = { 
    'XGB': {
            'model': GradientBoostingClassifier(),
            'params': {
                'learning_rate' : [.0001,.001, .01, .1],
                'n_estimators' : [100,200,500,1000],
                'max_features': ['sqrt','log2'],
                'max_depth':list(range(11))
            }
        },
    'Random Forest':
    {
        'model':RandomForestClassifier(),
        'params':
        {
            'n_estimators':[10,50,100,200],
            'max_features':['auto','sqrt','log2'],
            'max_depth':list(range(1,11))
        }
    }
}
In [67]:
cv= RepeatedStratifiedKFold(n_splits=5, n_repeats=2)
scores=[]
for model_name, params in model_params.items(): 
    rs = RandomizedSearchCV(params['model'], params['params'],cv=cv, n_iter=20)
    rs.fit(X,y)
    scores.append([model_name, dict(rs.best_params_), rs.best_score_])
data=pd.DataFrame(scores, columns=['Model','Parameters','Score'])
data
Out[67]:
Model Parameters Score
0 XGB {'n_estimators': 1000, 'max_features': 'log2',... 0.665446
1 Random Forest {'n_estimators': 200, 'max_features': 'auto', ... 0.664225
In [68]:
param=data['Parameters']
model = VotingClassifier(estimators=[
                                     ('XGB',GradientBoostingClassifier(**param[0])),
                                     ('RF',RandomForestClassifier(**param[1])),
                                    ],voting='hard')

accuracy=[]
scaler = StandardScaler()
skf = RepeatedStratifiedKFold(n_splits=5,n_repeats=2)
skf.get_n_splits(X,y)

for train_index, test_index in skf.split(X,y):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    model.fit(X_train,y_train)
    predictions=model.predict(X_test)
    score=accuracy_score(y_test,predictions)
    accuracy.append(score)    
In [69]:
np.mean(accuracy)
Out[69]:
0.6689548035747532
In [70]:
"""
The TDS levels seem to contain some descripency since its values are on an average 40 folds more than the upper limit for safe drinking water.

The data contains almost equal number of acidic and basic pH level water samples.

92% of the data was considered Hard.

Only 2% of the water samples were safe in terms of Chloramines levels.

Only 1.8% of the water samples were safe in terms of Sulfate levels.

90.6% of the water samples had higher Carbon levels than the typical Carbon levels in drinking water (10 ppm).

76.6% of water samples were safe for drinking in terms of Trihalomethane levels in water.

90.4% of the water samples were safe for drinking in terms of the Turbidity of water samples.

The correlation coefficients between the features were very low.

Random Forest and XGBoost worked the best to train the model.

The ensemble method of using the Voting Classfier on Stratified K-folded samples gave an accuracy of >64%
"""
Out[70]:
'\nThe TDS levels seem to contain some descripency since its values are on an average 40 folds more than the upper limit for safe drinking water.\n\nThe data contains almost equal number of acidic and basic pH level water samples.\n\n92% of the data was considered Hard.\n\nOnly 2% of the water samples were safe in terms of Chloramines levels.\n\nOnly 1.8% of the water samples were safe in terms of Sulfate levels.\n\n90.6% of the water samples had higher Carbon levels than the typical Carbon levels in drinking water (10 ppm).\n\n76.6% of water samples were safe for drinking in terms of Trihalomethane levels in water.\n\n90.4% of the water samples were safe for drinking in terms of the Turbidity of water samples.\n\nThe correlation coefficients between the features were very low.\n\nRandom Forest and XGBoost worked the best to train the model.\n\nThe ensemble method of using the Voting Classfier on Stratified K-folded samples gave an accuracy of >64%\n'
In [ ]: